***prepare data for 2022 applicants

cd E:\NTU\work\submit_SS

*****************HIGH SCHOOL BASIC INFORMATION**********************************
/*this file extracts data for high school information from Ministry of Education's webpage. 
The rawdata can be downloaded from the following website:
https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fstats.moe.gov.tw%2Ffiles%2Fdetail%2F110%2F110_base2.xls&wdOrigin=BROWSELINK
*/
clear
**import data for high school information from the first worksheet in 110_base2.xls
import excel "110_base2.xls", sheet("110_base2") cellrange(A3:AG3846) firstrow
drop if 學程等級名稱=="附設國中部" /*drop affiliated junior high schools*/ 

*calculate number of academic track senior students by gender by high schools 
preserve 
keep if 群別名稱=="學術群" /*keep only academic track students*/
collapse (sum) academicM3=三年級男 academicF3=三年級女 (first) 學校名稱2=學校名稱 , by(學校代碼)
/*collapse data to obtain no. of senior students by gender*/
sort  學校代碼 
label var academicM3 "no. of academic-track male senior student in R's high school"
label var academicF3 "no. of academic-track female senior student in R's high school'"
save "academic110.dta", replace
restore 

*calculate total number of senior students (academic + vocation track) by gender by schools 
preserve
collapse (sum) HM3=三年級男 HF3=三年級女 (first) 學校名稱3=學校名稱, by(學校代碼)
save "highsch110.dta", replace
restore 

/**imporat population data from Ministry of Interior, Department of Household Registration, M.O.I. 
the rawdata can be downloaded from the following website:
https://www.ris.gov.tw/app/portal/346
the original file "鄉鎮市區人口數按性別及年齡分(96-110).xls" is renamed to "pop_gender_age110.xls"
compute population aged 17-18 in each administrative district (township, town, city, or district)
*/
clear
import excel  using  "pop_gender_age110.xls", sheet("110") cellrange(A6:DC790) firstrow clear allstring
keep gender areacode areaname city cityarea age_tot age17 age18  
sort cityarea
destring age_tot age17 age18, replace
*save "e:/NTU/work/age1718.dta", replace
collapse (sum) allpop=age_tot all17=age17 all18 = age18 (first) areacode city, by(cityarea)
clonevar areacode1 = areacode
destring areacode1, replace 
gen test = mod(areacode1, 1000)
drop if test == 0 /*drop the cases contained the subtotal in the city*/
drop test areacode1 
sort cityarea
save "all1718Y111.dta", replace

**import data for high school address and zipcode 
/*the rawdata can be downloaded from the website of Ministry of Education
https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fstats.moe.gov.tw%2Ffiles%2Fschool%2F110%2Fhigh.xls&wdOrigin=BROWSELINK
*/
*import the file from excel spreadsheet and save the file as 110high_addr.dta 
clear 
import excel "110_high.xls", sheet("bc31") cellrange(A3:H518) firstrow allstring 
sort 學校代碼
save  "110high_addr.dta", replace

**merge high school address and zipcode (110high_addr.dta) with high school student data (highsch110.dta)
use "highsch110.dta", clear /*read in high school student data file*/
sort 學校代碼
merge 1:1 學校代碼 using 110high_addr.dta /*use school code to merge with high school address data*/
list	學校名稱	學校名稱3	學校代碼 就讀高中	_merge	if	_merge	<3 /*14 schools are missing*/
replace	 就讀高中=學校名稱3 if	_merge	==1 /*replace missing high school names*/
drop _merge
sort 學校代碼

**merge with number of academic track senior students 
merge 1:1 學校代碼 using academic110.dta
drop 學校名稱2 
drop _merge
replace academicM3 = 0 if HM3~=. & academicM3==.
replace academicF3 = 0 if HF3~=. & academicF3==.
sort 就讀高中

*calculate total number of senior students (male+female) 
gen tot3=HM3+HF3
label var tot3 "total number of senior students in R's high school"

**create percentage of academic track in high school 
gen academic3=academicM3+academicF3
label var academic3 "total no. of senior academic track students"
gen academicp =academic3/tot3
label var academicp "percentage of senior academic track students"
sort 學校代碼
save  "hsch_info110.dta", replace
************END OF HIGH SCHOOL BASIC INFORMATION********************************

***READ HOUSING PRICE INFORMATION **********************************************
/*the housing price information is taken from the following website https://price.houseprice.tw/list/
the python program to extract housing price information is stored in Replicatoin_Addr2HousePrice.zip
we provide a query dataset with 10 psudo address to test run the program */
*read housing price information
clear
import excel "house_price_result111.xlsx", sheet("Sheet1") cellrange(A1:N4303) firstrow
sort No
save  "house_price111.dta", replace
*******************************************************************************


***READ 2022 NTU applicants data **************************************************
*the data of student applicants are stored in three different worksheets
*the following section read the three worksheets and combined them into one file, the three worksheets contained information on: (1) those who failed the first round selection (2) those who passed 1st round but did not participate the 2nd round selection (3) those who entered the 2nd round the data 
**(1) read applicants who did not pass the first round selection
set more off
clear
import excel "applicant111.xlsx", sheet("111-未通過一階") cellrange(A1:AL7376) firstrow 
destring  郵遞區號 , force replace 
destring 學測應試號碼, force replace
gen dsource = 1
save "fail111.dta", replace

clear
**(2)read student applicants who passed the first round but did not enter the second round 
import excel "applicant111.xlsx", sheet("111-進二階+但未繳費") cellrange(A1:AO930) firstrow 
gen dsource = 2
save "gaveup111.dta", replace

destring 校系代碼, force replace 
**append data  (1)+(2)
append using "fail111.dta"
save "notin2nd111.dta", replace 

*(3) read applicants who passed both 1st and 2nd round 
clear
import excel "applicant111.xlsx", sheet("111-進二階") cellrange(A1:BC4480) firstrow
destring 學測應試號碼 校系代碼 畢業年份 戶籍郵遞, force replace 

**append data (1)+(2)+(3)
append using "notin2nd111.dta"
replace dsource = 3 if dsource==.
label define dsource 1 "failed 1st round" 2 "gave up 2nd round" 3 "in 2nd round"
label value dsource dsource

drop if 類別 =="希望組" /*drop 257 students from financially disadvantaged family who are on 'Hope Scholarship' and did not compete with other domestic undergraduate student*/

*rename "Bachelor program of international sports affairs" to be consistent with other files
replace 學系組名稱 = "國際體育運動事務學士學位學程" if 學系組名稱=="國際體育運動事務學程/共同教育中心"
 
clonevar 就讀高中 = s_edu_name /*make a copy of the original high school name*/

*correct the high school names in NTU's data to match those used by the Ministry of Education for file merging 修正台大資料中的高中名稱，使其與教育部的名稱相同，以便合併檔案
replace	就讀高中=	"國立中山大學附屬國光高中"	if 就讀高中==	"國立中山大學附中"
replace	就讀高中=	"國立中科實驗高級中學"		if 就讀高中==	"國立中科實驗高中"
replace	就讀高中=	"國立興大附中"				if 就讀高中==	"國立中興大學附中"
replace	就讀高中=	"國立竹科實驗高級中等學校"	if 就讀高中==	"國立新竹科學園區實驗高中"
replace	就讀高中=	"國立師大附中"				if 就讀高中==	"國立臺灣師大附中"
replace	就讀高中=	"國立華僑高級中等學校"		if 就讀高中==	"國立華僑高中"
replace	就讀高中=	"國立高師大附中"			if 就讀高中==	"國立高雄師大附中"
replace	就讀高中=	"私立再興中學"				if 就讀高中==	"私立再興高中"
replace	就讀高中=	"財團法人南山高中"			if 就讀高中==	"私立南山高中"
replace	就讀高中=	"桃園市啟英高中"			if 就讀高中==	"私立啟英高中"
replace	就讀高中=	"新北市崇光高中"			if 就讀高中==	"私立崇光高中"
replace	就讀高中=	"私立延平中學"				if 就讀高中==	"私立延平高中"
replace	就讀高中=	"桃園市復旦高中"			if 就讀高中==	"私立復旦高中"
replace	就讀高中=	"財團法人慈濟高中"			if 就讀高中==	"私立慈濟高中"
replace	就讀高中=	"桃園市新興高中"			if 就讀高中==	"私立新興高中"
replace	就讀高中=	"桃園市治平高中"			if 就讀高中==	"私立治平高中"
replace	就讀高中=	"財團法人聖功女中"			if 就讀高中==	"私立聖功女中"
replace	就讀高中=	"臺南市興國高中"			if 就讀高中==	"私立興國高中"
replace	就讀高中=	"新北市裕德高級中等學校"	if 就讀高中==	"私立裕德高中"
replace	就讀高中=	"財團法人辭修高中"			if 就讀高中==	"私立辭修高中"
replace	就讀高中=	"天主教道明中學"			if 就讀高中==	"私立道明高中"
replace	就讀高中=	"臺北市靜心高中"			if 就讀高中==	"私立靜心高中"
replace	就讀高中=	"臺北市立建國中學"			if 就讀高中==	"臺北市立建國高中"
replace	就讀高中=	"臺北市立成功中學"			if 就讀高中==	"臺北市立成功高中"
replace	就讀高中=	"臺北市立北一女中"			if 就讀高中==	"臺北市立第一女中"
replace	就讀高中=	"高雄市立高雄中學"			if 就讀高中==	"高雄市立高雄高中"

sort 就讀高中

*replace department names missing from department codes for applicants failed 1st round
replace	學系組名稱	=	"中國文學系"				if	校系代碼==	1012	&	學系組名稱	==""
replace	學系組名稱	=	"外國語文學系"				if	校系代碼==	1022	&	學系組名稱	==""
replace	學系組名稱	=	"歷史學系"					if	校系代碼==	1032	&	學系組名稱	==""
replace	學系組名稱	=	"哲學系"					if	校系代碼==	1042	&	學系組名稱	==""
replace	學系組名稱	=	"人類學系"					if	校系代碼==	1052	&	學系組名稱	==""
replace	學系組名稱	=	"圖書資訊學系"				if	校系代碼==	1062	&	學系組名稱	==""
replace	學系組名稱	=	"日本語文學系"				if	校系代碼==	1072	&	學系組名稱	==""
replace	學系組名稱	=	"戲劇學系(女)"				if	校系代碼==	1082	&	學系組名稱	==""
replace	學系組名稱	=	"數學系"					if	校系代碼==	1092	&	學系組名稱	==""
replace	學系組名稱	=	"物理學系"					if	校系代碼==	1102	&	學系組名稱	==""
replace	學系組名稱	=	"化學系"					if	校系代碼==	1112	&	學系組名稱	==""
replace	學系組名稱	=	"地質科學系"				if	校系代碼==	1122	&	學系組名稱	==""
replace	學系組名稱	=	"心理學系"					if	校系代碼==	1132	&	學系組名稱	==""
replace	學系組名稱	=	"地理環境資源學系"			if	校系代碼==	1142	&	學系組名稱	==""
replace	學系組名稱	=	"大氣科學系"				if	校系代碼==	1152	&	學系組名稱	==""
replace	學系組名稱	=	"政治學系政治理論組"		if	校系代碼==	1162	&	學系組名稱	==""
replace	學系組名稱	=	"政治學系國際關係組"		if	校系代碼==	1172	&	學系組名稱	==""
replace	學系組名稱	=	"政治學系公共行政組"		if	校系代碼==	1182	&	學系組名稱	==""
replace	學系組名稱	=	"經濟學系A組"				if	校系代碼==	1192	&	學系組名稱	==""
replace	學系組名稱	=	"經濟學系B組"				if	校系代碼==	1202	&	學系組名稱	==""
replace	學系組名稱	=	"經濟學系C組"				if	校系代碼==	1212	&	學系組名稱	==""
replace	學系組名稱	=	"社會學系"					if	校系代碼==	1222	&	學系組名稱	==""
replace	學系組名稱	=	"社會工作學系"				if	校系代碼==	1232	&	學系組名稱	==""
replace	學系組名稱	=	"醫學系"					if	校系代碼==	1242	&	學系組名稱	==""
replace	學系組名稱	=	"醫學系(公費生)"			if	校系代碼==	1252	&	學系組名稱	==""
replace	學系組名稱	=	"牙醫學系"					if	校系代碼==	1262	&	學系組名稱	==""
replace	學系組名稱	=	"藥學系"					if	校系代碼==	1272	&	學系組名稱	==""
replace	學系組名稱	=	"醫學檢驗暨生物技術學系"	if	校系代碼==	1282	&	學系組名稱	==""
replace	學系組名稱	=	"護理學系"					if	校系代碼==	1292	&	學系組名稱	==""
replace	學系組名稱	=	"物理治療學系"				if	校系代碼==	1302	&	學系組名稱	==""
replace	學系組名稱	=	"職能治療學系"				if	校系代碼==	1312	&	學系組名稱	==""
replace	學系組名稱	=	"土木工程學系"				if	校系代碼==	1322	&	學系組名稱	==""
replace	學系組名稱	=	"機械工程學系"				if	校系代碼==	1332	&	學系組名稱	==""
replace	學系組名稱	=	"化學工程學系"				if	校系代碼==	1342	&	學系組名稱	==""
replace	學系組名稱	=	"工程科學及海洋工程學系"	if	校系代碼==	1352	&	學系組名稱	==""
replace	學系組名稱	=	"材料科學與工程學系"		if	校系代碼==	1362	&	學系組名稱	==""
replace	學系組名稱	=	"醫學工程學系"				if	校系代碼==	1372	&	學系組名稱	==""
replace	學系組名稱	=	"農藝學系"					if	校系代碼==	1382	&	學系組名稱	==""
replace	學系組名稱	=	"生物環境系統工程學系"		if	校系代碼==	1392	&	學系組名稱	==""
replace	學系組名稱	=	"農業化學系"				if	校系代碼==	1402	&	學系組名稱	==""
replace	學系組名稱	=	"森林環境暨資源學系"		if	校系代碼==	1412	&	學系組名稱	==""
replace	學系組名稱	=	"動物科學技術學系"			if	校系代碼==	1422	&	學系組名稱	==""
replace	學系組名稱	=	"農業經濟學系"				if	校系代碼==	1432	&	學系組名稱	==""
replace	學系組名稱	=	"園藝暨景觀學系"			if	校系代碼==	1442	&	學系組名稱	==""
replace	學系組名稱	=	"獸醫學系"					if	校系代碼==	1452	&	學系組名稱	==""
replace	學系組名稱	=	"生物產業傳播暨發展學系"	if	校系代碼==	1462	&	學系組名稱	==""
replace	學系組名稱	=	"生物機電工程學系"			if	校系代碼==	1472	&	學系組名稱	==""
replace	學系組名稱	=	"昆蟲學系"					if	校系代碼==	1482	&	學系組名稱	==""
replace	學系組名稱	=	"植物病理與微生物學系"		if	校系代碼==	1492	&	學系組名稱	==""
replace	學系組名稱	=	"工商管理學系企業管理組"	if	校系代碼==	1502	&	學系組名稱	==""
replace	學系組名稱	=	"工商管理學系科技管理組"	if	校系代碼==	1512	&	學系組名稱	==""
replace	學系組名稱	=	"會計學系"					if	校系代碼==	1522	&	學系組名稱	==""
replace	學系組名稱	=	"財務金融學系"				if	校系代碼==	1532	&	學系組名稱	==""
replace	學系組名稱	=	"國際企業學系"				if	校系代碼==	1542	&	學系組名稱	==""
replace	學系組名稱	=	"資訊管理學系"				if	校系代碼==	1552	&	學系組名稱	==""
replace	學系組名稱	=	"公共衛生學系"				if	校系代碼==	1562	&	學系組名稱	==""
replace	學系組名稱	=	"電機工程學系"				if	校系代碼==	1572	&	學系組名稱	==""
replace	學系組名稱	=	"資訊工程學系"				if	校系代碼==	1582	&	學系組名稱	==""
replace	學系組名稱	=	"資訊工程學系(APCS組)"		if	校系代碼==	1592	&	學系組名稱	==""
replace	學系組名稱	=	"法律學系法學組"			if	校系代碼==	1602	&	學系組名稱	==""
replace	學系組名稱	=	"法律學系司法組"			if	校系代碼==	1612	&	學系組名稱	==""
replace	學系組名稱	=	"法律學系財經法學組"		if	校系代碼==	1622	&	學系組名稱	==""
replace	學系組名稱	=	"生命科學系"				if	校系代碼==	1632	&	學系組名稱	==""
replace	學系組名稱	=	"生化科技學系"				if	校系代碼==	1642	&	學系組名稱	==""
replace	學系組名稱	=	"國際體育運動事務學士學位學程"	if	校系代碼==	1652	&	學系組名稱	==""
replace	學系組名稱	=	"希望組甲組(文社法)"		if	校系代碼==	1662	&	學系組名稱	==""
replace	學系組名稱	=	"希望組乙組(政經地理)"		if	校系代碼==	1672	&	學系組名稱	==""
replace	學系組名稱	=	"希望組丙組(理)"			if	校系代碼==	1682	&	學系組名稱	==""
replace	學系組名稱	=	"希望組丁組(生醫)"			if	校系代碼==	1692	&	學系組名稱	==""
replace	學系組名稱	=	"希望組戊組(工電資)"		if	校系代碼==	1702	&	學系組名稱	==""
replace	學系組名稱	=	"希望組己組(商管)"			if	校系代碼==	1712	&	學系組名稱	==""
replace	學系組名稱	=	"希望組庚組(生農)"			if	校系代碼==	1722	&	學系組名稱	==""
replace	學系組名稱	=	"希望組辛組(醫牙)"			if	校系代碼==	1732	&	學系組名稱	==""

encode 學系組名稱, generate(dept)
*generate college names from the department code
replace 學院 = "公共衛生學院"  		if 校系代碼 == 1562 & 學院 ==""
replace 學院 = "共同教育中心"  		if 校系代碼 == 1652 & 學院 ==""
replace 學院 = "工學院"  			if 校系代碼 >= 1322 & 校系代碼<=1372  & 學院 ==""
replace 學院 = "文學院"  			if 校系代碼 >= 1012 & 校系代碼<=1082  & 學院 ==""
replace 學院 = "法律學院"  			if 校系代碼 >= 1602 & 校系代碼<=1622  & 學院 ==""
replace 學院 = "理學院"  			if 校系代碼 >= 1092 & 校系代碼<=1152  & 學院 ==""
replace 學院 = "生命科學院"  		if 校系代碼 >= 1632 & 校系代碼<= 1642  & 學院 ==""
replace 學院 = "生物資源暨農學院"  	if 校系代碼 >=  1382 & 校系代碼<=1492  & 學院 ==""
replace 學院 = "社會科學院"  		if 校系代碼 >=  1162 & 校系代碼<=1232  & 學院 ==""
replace 學院 = "管理學院"  			if 校系代碼 >=  1502 & 校系代碼<=1552  & 學院 ==""
replace 學院 = "醫學院"  			if 校系代碼 >=  1242  & 校系代碼<=1312  & 學院 ==""
replace 學院 = "電機資訊學院"  		if 校系代碼 >=1572 & 校系代碼 <= 1592  & 學院 ==""

*create college codes
encode 學院, generate(college)

*create gender variable
encode 性別, generate(gender)
gen female = gender==1
rename s_number 學校代碼 /*rename high school code variable*/

***correct high school names and codes  
replace 就讀高中 = "市立大同高中" if 就讀高中 == "私立大同高中"
replace 學校代碼 = "343302" if 學校代碼 == "341302"

**add city names (as prefix) to school names to distinguish high schools with same names but located in different districts 
replace 就讀高中 = "臺北市立中正高中" if 學校代碼 =="423302"
replace 就讀高中 = "高雄市立中正高中" if 學校代碼 =="583301"
replace 就讀高中 = "新北市立新莊高中" if 學校代碼 =="013337"
replace 就讀高中 = "高雄市立新莊高中" if 學校代碼 =="533302"
replace 就讀高中 = "臺北市立陽明高中" if 學校代碼 =="413301"
replace 就讀高中 = "桃園市立陽明高中" if 學校代碼 =="033325"

replace 學校代碼 ="0" if 學校代碼 =="" /*assign code 0 to forign high school or alternative education*/ 

**merge high school information from hsch_info.dta 
merge m:1 學校代碼 using  "hsch_info110.dta"
*drop cases from using file
drop if _merge==2

********merge population of aged 17-18 from all1718Y111.dta*************************
**extract the names of adminstrative district from student's household address
*manually correct the names of adminstrative district due to data parsing failure
clonevar cityarea = 戶籍地址區
replace cityarea =	"台南市官田區"	if cityarea ==	"台南市官田區渡拔里渡子頭05鄰87-2號"
replace cityarea =	"屏東縣潮州鎮"	if cityarea ==	"屏東縣潮州鎮敦化路38號"
replace cityarea =	"彰化縣芬園鄉"	if cityarea ==	"彰化縣芬園鄉舊社村新興路一段507號"
replace cityarea =	"彰化縣芬園鄉"	if cityarea ==	"彰化縣芬園鄉芬園村009鄰南妙街213號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市中正路一段392號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市五權里002鄰彰和路一段1號之12"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市介壽新村198號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市大埔路439巷25號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市寶? 路200號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市延和里8鄰埔西街160巷3-12號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市彰南路一段43號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市彰安里010鄰民族路253巷95號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市精誠路113-50號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市萬安里029鄰建國南路36巷5號九樓"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市西勢里辭修路316巷19-8號"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市金馬路三段561號9樓"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市金馬路二段483號11樓"
replace cityarea =	"彰化縣彰化市"	if cityarea ==	"彰化縣彰化市長順街23號"
replace cityarea =	"臺中市烏日區"	if cityarea ==	"臺中市烏日區前竹里仁義街26號"
replace cityarea =	"臺中市烏日區"	if cityarea ==	"臺中市烏日區振興街42-9號"
replace cityarea =	"臺中市烏日區"	if cityarea ==	"臺中市烏日區興學街21巷6號"
**replace the word 台 with 臺 if any
replace 	cityarea=	"臺中市北屯區"	if cityarea==	"台中市北屯區"
replace 	cityarea=	"臺中市大里區"	if cityarea==	"台中市大里區"
replace 	cityarea=	"臺北市萬華區"	if cityarea==	"台北市萬華區"
replace 	cityarea=	"臺南市安南區"	if cityarea==	"台南市安南區"
replace 	cityarea=	"臺南市官田區"	if cityarea==	"台南市官田區"
replace 	cityarea=	"臺東縣卑南鄉"	if cityarea==	"台東縣卑南鄉"
replace 	cityarea=	"臺東縣臺東市"	if cityarea==	"台東縣台東市"
replace 	cityarea=	"臺東縣池上鄉"	if cityarea==	"台東縣池上鄉"
replace 	cityarea=	"臺東縣綠島鄉"	if cityarea==	"台東縣綠島鄉"
replace 	cityarea=	"嘉義市西區"	if cityarea==	"嘉義市新榮路"
replace 	cityarea=	"臺中市南屯區"	if cityarea==	"臺中市南屯"
**replace the missing district names based on the street names
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市世賢路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市保健街"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市光彩街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市公明路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市博愛路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市博東路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市向榮街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市吳鳳南"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市國華街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市垂楊路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市大平街"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市忠義街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市文雅街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市林森東"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市武昌街"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市永樂二"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市立仁路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市維和街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市維新路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市育人路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市蘭潭里"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市西區文"
replace cityarea=	"屏東縣屏東市"	if cityarea ==	"屏東縣屏東縣"
replace cityarea=	"彰化縣員林市"	if cityarea ==	"彰化縣員林鎮"
replace cityarea=	"新北市八里區"	if cityarea ==	"新北市八里"
replace cityarea=	"新北市永和區"	if cityarea ==	"新北市永和保"
replace cityarea=	"新北市永和區"	if cityarea ==	"新北市永和郵"
replace cityarea=	"新北市萬里區"	if cityarea ==	"新北市萬里"
replace cityarea=	"新北市蘆洲區"	if cityarea ==	"新北市蘆洲玉"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市三民路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市中央路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市中正路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市中清路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市中華路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市中雅里"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市光復路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市光華二"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市公道五"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市北大路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市國光街"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市園後街"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市埔頂一"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市學府路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市安富街"
replace cityarea=	"新竹市香山區"	if cityarea ==	"新竹市富群街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市延平路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市建中一"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市建功二"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市建興街"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市忠孝路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市慈雲路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市明湖路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市東區竹"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市東南街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市東大路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市東山街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市東門街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市武陵路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市民權路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市民生路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市湳中街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市湳雅街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市磐石里"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市福德里"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市竹光路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市經國路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市西大路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市西門街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市鐵道路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市關新路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市關東路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市食品路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市高峰路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市龍山東"
replace cityarea=	"臺中市中區"	if cityarea ==	"臺中市中區柳"
replace cityarea=	"臺中市中區"	if cityarea ==	"臺中市中區民"
replace cityarea=	"臺中市中區"	if cityarea ==	"臺中市中區自"
replace cityarea=	"臺中市北區"	if cityarea ==	"臺中市北區大"
replace cityarea=	"臺中市北區"	if cityarea ==	"臺中市北區英"
replace cityarea=	"臺中市北區"	if cityarea ==	"臺中市北區青"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區三"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區復"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區柳"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區樹"
replace cityarea=	"臺中市東區"	if cityarea ==	"臺中市東區東"
replace cityarea=	"臺中市西區"	if cityarea ==	"臺中市西區五"
replace cityarea=	"臺中市西區"	if cityarea ==	"臺中市西區公"
replace cityarea=	"臺中市西區"	if cityarea ==	"臺中市西區昇"
replace cityarea=	"臺中市西屯區"	if cityarea ==	"臺中市西屯福"
replace cityarea=	"臺北市萬華區"	if cityarea ==	"臺北市西藏路"
replace cityarea=	"臺南市佳里區"	if cityarea ==	"臺南市佳里"
replace cityarea=	"花蓮縣玉里鎮"	if cityarea ==	"花蓮縣玉里"
replace cityarea=	"高雄市路竹區"	if cityarea ==	"高雄市路"   /*new correction*/
replace cityarea=	"高雄市鼓山區"	if cityarea ==	"高雄市鼓山二"
replace cityarea = "花蓮市玉里鎮" if cityarea =="花蓮市玉里鄉"

sort cityarea 
drop _merge

*merge the population aged 17-18 
merge m:1 cityarea using "all1718Y111.dta"
drop if _merge == 2 /*drop cases only in using file*/

**calculate high school rank = 1st round acceptance rate by high school =
*(no. of students passed 1st round in R's high school)/(total no. of senior students in R's high school)
preserve
keep if dsource >=2 /*select those who passed 1st round*/
bysort 就讀高中: gen hsch_ntu = _N  /*calucate no. of students passed 1st round by high school*/
label var hsch_ntu "no. of students passed 1st round in R's high school"
collapse (mean) hsch_ntu, by(就讀高中)
save "hsch_ntu111.dta", replace
restore 

drop _merge
merge m:1 就讀高中 using hsch_ntu111.dta
recode hsch_ntu (.=0) if _merge==1 /*schools not in the using file have no students passed 1st round*/
gen hschrank = hsch_ntu/tot3 
label var hschrank "high school rank = % of senior students passed 1st round exam"


**update the housing price for those who passed 2nd round
sort No 
drop _merge
merge m:1 No using "house_price111.dta", update

gen housing = 單價中位數_萬
label var housing "median housing price"

**replace 8 missing cases with the price of the nearest neighborhood
list No 戶籍郵遞 戶籍地址 if housing ==., clean
replace housing = 17.25 if (No ==3757 | No == 153) 
replace housing = 31.3  if (No ==2116 | No == 994)	
replace housing = 37.05 if No == 4031 
replace housing = 0.7 if No== 482 | No == 2519 |No== 3201 |No== 4504 |No== 5184
**replace 2 missing cases with the average price from the same zipcode 
sum housing if 戶籍郵遞== 741 & dsource==3
sum housing if 戶籍郵遞== 621 & dsource==3
replace housing = 20.3 if 戶籍郵遞== 741 & housing ==.
replace housing =  12.6 if 戶籍郵遞== 621 & housing ==.

gen lnhousingp=ln(housing)
label var lnhousingp "log(median housing price)"

**combine the sub-field in the same department into one department name 
gen bname = 學系組名稱
replace bname=	"工商管理學系"	if bname==	"工商管理學系企業管理組"	
replace bname=	"工商管理學系"	if bname==	"工商管理學系科技管理組"	
replace bname=	"法律學系"	if bname==	"法律學系司法組"	
replace bname=	"法律學系"	if bname==	"法律學系法學組"	
replace bname=	"法律學系"	if bname==	"法律學系財經法學組"	
replace bname=	"政治學系"	if bname==	"政治學系公共行政組"	
replace bname=	"政治學系"	if bname==	"政治學系政治理論組"	
replace bname=	"政治學系"	if bname==	"政治學系國際關係組"	
replace bname=	"經濟學系"	if bname==	"經濟學系A組"	
replace bname=	"經濟學系"	if bname==	"經濟學系B組"	
replace bname=	"經濟學系"	if bname==	"經濟學系C組"	
replace bname=	"戲劇學系"	if bname==	"戲劇學系(女)"	
replace bname=	"戲劇學系"	if bname==	"戲劇學系(男)"	
replace bname=	"醫學系"	if bname==	"醫學系"	
replace bname=	"醫學系"	if bname==	"醫學系(公費生)"	 
replace bname=  "資訊工程學系" if bname=="資訊工程學系(APCS組)"	
sort bname
drop _merge
label var bname "department name"

/*the following two datasets were downloaded from Ministrey of Education's website:
https://depart.moe.edu.tw/ED4500/News_Content.aspx?n=5A930C32CC6C3818&sms=91B3AAE8C6388B96&s=B7F6EA80CA2F63EE
*/
**merge the students profile of the department 
merge m:1 bname using "NTU_s110.dta"
drop if _merge==2
drop _merge
**merge faculty profile of the department
merge m:1 bname using "NTU_t110.dta"
drop if _merge==2
drop _merge
drop year

***calculate percentage of female students in the department
gen s_malep = mtot/stotal
gen s_femalep =ftot/stotal
label var s_malep "% of male students in the department"
label var s_femalep  "% of female students in the department"
***calculate percentage of fist year female students 
gen s1_malep = m1/(m1+f1)
gen s1_femalep =f1/(m1+f1)
label var s1_malep "% of male 1st yr students in the department"
label var s1_femalep  "% of female 1st yr students in the department"

*calculate the percentage of opposite sex in the department 
gen s_samesex = s_femalep if female==1
replace s_samesex = s_malep if female==0
label var s_samesex "% of same sex students in the department"

**calculate the percentage of male/female faculty in the department
gen t_malep = ttot_m/ttotal
gen t_femalep = ttot_f/ttotal
label var t_malep "% of male faculty in the department"
label var t_femalep "% of female faculty in the department"
gen t_samesex = t_malep if female == 0
replace t_samesex = t_femalep if female==1
label var t_samesex "% of same sex faculty in the department"

**single sex high school- one sex exceeded 95%
gen allmale=malep3>.95
gen allfemale=malep3<.05
label var allmale "all male high school"
label var allfemale "all female high school"

***create variable for application outcomes
gen 	outcome = 3 	if 錄取別=="正取"
replace outcome = 2 	if 錄取別=="備取"
replace outcome = 1 	if 錄取別=="未錄取"
replace outcome = 0 	if dsource==2
replace outcome = -1 	if dsource==1
label var outcome "application outcomes"
label define outcome -1 "did not passed 1st round" 0 "not in 2nd rd" 1 "rejected" 2 "waiting" 3 "admitted" , replace
label value outcome outcome

***create indicator for minority and remote islander status
gen 	minority = 0 if 考生身份 =="一般考生" | 考生身份 =="一般生"
replace minority = 1 if 考生身份 =="原住民生" |  考生身份 =="原住民"  
replace minority = 2 if 考生身份 =="離島考生" | 考生身份 =="離島生" | 考生身份 == "願景計畫生"
label var minority "minority status of student"
label define minority 0 "regular student" 1 "aboriginal" 2 "remote islander"
label value minority minority

**create indicator of whether the respodnent atteded a public or private high school 
gen private =公私立別=="私立"
label var private "attended private high school==1"

*the following do file assign weights to various assessment components, the weights are taken from the application broshure 2022.pdf.
do "score weight111.do"
label var testscorep "weight for GSAT學測佔甄選總成績的%"
label var oralp "weight for interview面試佔甄選總成績的%"
label var writtenp "weight for written exam筆試佔甄試總成績%"
label var reviewp "weight for dossier審查成績佔甄試總成績%"
label var written1 "weight for written test 1筆試1佔甄試總成績%"
label var written2 "weight for written test 2筆試2佔甄試總成績%"
label var written3 "weight for written test 3筆試3佔甄試總成績%"

***create scores for GSAT, in-person interview, dossier review, written test 1-4, pages of dossier 
clonevar exam = 學測成績佔分
clonevar exam100= 學測成績
clonevar review =審查成績佔分
clonevar oral = 口試成績佔分
clonevar pen1 =筆試1
clonevar pen2 =筆試2
clonevar pen3 =筆試3
clonevar pen4 =筆試4
clonevar page =書審資料EP頁數
recode exam review oral pen1 - pen4 (-1=0)

label variable exam100 "Normalized GSAT score"
label variable exam "weighted GSAT score"
label variable review "weighted scores for dossier review"
label variable oral "weighted scores for in-person interview"
label variable pen1 "written test 1 score"
label variable pen2 "written test 2 score"
label variable pen3 "written test 3 score"
label variable pen4 "written test 4 score"
label variable page "pages of dossier"
 
*create variables for GSAT test score by subjects
clonevar chinese = 學測國文級分	
clonevar english = 學測英文級分	
clonevar mathA = 學測數學A級分
clonevar mathB = 學測數學B級分
clonevar social = 學測社會級分
clonevar natural = 學測自然級分
label variable chinese "GSAT score for Chinese"
label variable english "GSAT score for English"
label variable mathA "GSAT score for Mathematics A"
label variable mathB "GSAT score for Mathematics B"
label variable social "GSAT score for social studies"
label variable natural "GSAT score for natural science"

**five departments use GSAT for screening purpose only, including departments of ocean engeneering, mathematics, dentistry, medicine, physics. We assign equal weight to each subject to calculate the normalized GSAT score.  
replace exam100 = [(english + mathA + natural)/45]*100 if (bname =="工程科學及海洋工程學系" | bname =="數學系")  & dsource==3
replace exam100 = [(chinese + english + mathA + natural)/60]*100 if (bname == "牙醫學系" | bname =="醫學系" | bname == "物理學系") & dsource ==3

*calculate the final score for the applicant:
*step 1: sum up weighted scores for GSAT, dossier review and in-person interview
*step 2: adjusted the scores by adding weighted scores from the written tests if written test is required by the department. The weight for written tests can be found in "application broshure 2022.pdf"
egen score = rowtotal(exam review oral)  	if dsource ==3
replace score = score+pen1*0.2+pen2*0.15    if bname=="中國文學系"
replace score = score+pen1*0.45 			if bname=="動物科學技術學系"
replace score = score+pen1*0.30 			if bname=="化學工程學系"
replace score = score+pen1*0.25 			if bname=="化學系" 
replace score = score+pen1*0.30 			if bname=="土木工程學系"
replace score = score+pen1*0.20 			if bname== "外國語文學系"
replace score = score+pen1*0.25 			if bname=="大氣科學系"
replace score = score+pen1*0.55 	 		if bname=="數學系"
replace score = score+pen1*0.25 			if bname=="昆蟲學系"
replace score = score+pen1*0.25 			if bname=="材料科學與工程學系"
replace score = score+pen1*0.15 			if bname=="植物病理與微生物學系"
replace score = score+pen1*0.30 			if bname=="機械工程學系"
replace score = score +pen1*.15+pen2*.2+((pen3+pen4)/2)*.25 if bname == "牙醫學系"
replace score = score+(pen1*.32+pen2*.23)	if bname=="物理學系"
replace score = score+pen1*0.25 			if bname=="生命科學系"
replace score = score+pen1*0.10+pen2*0.10	if bname=="藥學系" 
replace score = score+(pen1)*0.4			if dept==56 /*資工*/
replace score = score+(pen1)*0.2			if dept==57 /*資工APCS*/
replace score = score+pen1*0.25 			if bname=="醫學工程學系"
replace score = score +((pen1+pen2)/2)*.1+((pen3+pen4)/2)*.4 if bname == "醫學系"
replace score = score+pen1*.15+pen2*.15		if bname=="電機工程學系"
replace score = score+(pen1)*0.5			if bname=="工程科學及海洋工程學系" 
sort dept outcome score 
label var score "甄選總成績"

**create rank within the department based on the final score
bysort dept: egen rank= rank(score)  
label var rank "rank order within the department"
**create the percentile rank within the department by dividing the rank by the maximum rank in the department
bysort dept: egen maxrank=max(rank) 
label var maxrank "全系排名最大值"
bysort dept: gen rank100 = (rank/maxrank)*100
label var rank100 "Percentile rank within department"

**create indicator variable "penexam" to indicate whether the department require written exam
gen penexam =writtenp>0	 	
label var penexam "department has additional written exam"
**create indicator "oralexam" to indicate whether the departemnt require oral exam
gen oralexam = oralp >0 
label var oralexam "has oral exam"

**create types of exam required by the department
gen 	examform =1 if oralexam==1 & penexam==0
replace examform =2 if oralexam==0 & penexam==1
replace examform =3 if oralexam==0 & penexam==0
replace examform =4 if oralexam==1 & penexam==1
label var examform "types of exam required"
label define examform 1 "oral=1, written=0" 2 "oral=0, written=1" 3 "oral=0, written=0"  4 "oral=1, written=1"
label value examform examform
*create indicator for students from remote island (kimman)
gen kimman=minority==2
*create indicator for students with aborigin family background
gen aborigin=minority==1
*create indicator for ronin status
gen retake = 畢業年份<111
*create indicator for admiited, rejected, and on the waiting list
gen waiting=outcome==2 
gen admitted=outcome==3
gen rejected=outcome==1

label var kimman "student from remote island = 1"
label var aborigin "student with obriginal background = 1"
label var retake "ronin status = 1"
label var waiting "on waiting list = 1"
label var admitted "admitted = 1"
label var rejected "rejected =1 "

**create high school GPA
clonevar sch_pr = 五學期校平均相對表現
clonevar cluster_pr = 五學期群平均相對表現
clonevar class_pr = 五學期科班學程平均相對表現
clonevar hsch_gpa =  五學期平均成績
encode 科班學程別, generate(class)
**only exam108=1 has high school GPA and PR rank
encode  備審資料, generate(exam108)
destring hsch_gpa, force replace
clonevar id = 學測應試號碼
label variable id "application ID number"
label variable hsch_gpa "High school grade average"

**calculate no of students apply for NTU
gen accepted  = (統一分發後結果=="錄取" | 統一分發後結果 == "放棄")
label var accepted "the applicant was accepted into NTU"

***drop cases
drop if 學校代碼 == "" | 學校代碼=="0"  /* drop 33 students without high school information  total cases 12526-33=12493, for those who entered 2nd round =4312-7=4305*/
drop if dept==9 /*drop 177 cases from newly founded Bachelor Program of International Sports Affairs  total N:  12493-177=12316, dsource == 3: 4305-27=4278*/ 

/*the following syntax calculate the percentage of college graduates in each adminstrative district
the original dataset is available from Ministry of Interior's open data website, can be downloaded from:
https://data.gov.tw/dataset/117988
*/
preserve
clear
use "pop110.dta"  /*read in the original data from MOI*/
gen pop25=pop if age =="25~29歲" 
gen college = pop if (edu == "大畢" |  edu=="碩畢" | edu=="博畢")
gen college25 = pop if age =="25~29歲"  & (edu == "大畢" |  edu=="碩畢" | edu=="博畢")
collapse (sum) pop pop25 college college25, by(區域別)
gen zipcollegep = college/pop
gen zipcollegep25 = college25/pop25
rename 區域別 cityarea
keep cityarea pop pop25 college college25 zipcollegep zipcollegep25
save  "collegep110.dta", replace
restore 

**the following syntax merge percentage of college students at each adminstrative district.
sort cityarea 
merge m:1 cityarea using "collegep110.dta"
drop if _merge==2
drop _merge
label var zipcollegep "% college gradautes in zipcode"
label var zipcollegep25 "% of college gradautes aged 25-29 in zipcode"

save "ntu_2022.dta", replace

